# replication for "Identifying the Role of High School in Educational Inequality: A Causal Mediation Approach."
## R version 4.4.1 (2024-06-14)

# remove objects but note that this approach lacks reproducibility and is environment-dependent
rm(list = ls())

# Packages ----------------------------------------------------------------
pacman::p_load(here,
               tidyverse,
               tictoc,
               Amelia,
               parallel,
               miceadds,
               estimatr,
               knitr,
               magrittr,
               EValue,
               devtools)

# Install additional packages
devtools::install_github(c(
  "xiangzhou09/rwrmed",
  "BS1125/CMAverse"
))

library(rwrmed)  # https://github.com/xiangzhou09/rwrmed
library(CMAverse)  # https://bs1125.github.io/CMAverse/


# Configuration -----------------------------------------------------------
REP <- 3    # Set to 1000 for final analysis
M_num <- 3  # Set to 80 for final analysis

# Load data ---------------------------------------------------------------
load(here("JLPSJ_replication_data.RData"))
names(JLPSJ_replication_data)


# Cross product terms -----------------------------------------------------
d_imp <- JLPSJ_replication_data |> 
  mutate(AM = A * M,
         AX1 = A * X1,
         AX2 = A * X2,
         AX3 = A * X3,
         AX4 = A * X4,
         AX5 = A * X5,
         AX6 = A * X6,
         AX7 = A * X7,
         AX8 = A * X8,
         AX9 = A * X9,
         AX10 = A * X10,
         AX11 = A * X11,
         MX1 = M * X1,
         MX2 = M * X2,
         MX3 = M * X3,
         MX4 = M * X4,
         MX5 = M * X5,
         MX6 = M * X6,
         MX7 = M * X7,
         MX8 = M * X8,
         MX9 = M * X9,
         MX10 = M * X10,
         MX11 = M * X11,
         MZ1 = M * Z1,
         MZ2 = M * Z2,
         MZ3 = M * Z3,
         MZ4 = M * Z4,
         MZ5 = M * Z5,
         MZ6 = M * Z6,
         MZ7 = M * Z7,
         MZ8 = M * Z8,
         MZ9 = M * Z9,
         MZ10 = M * Z10,
         MZ11 = M * Z11,
         MZ12 = M * Z12,
         AW1 = A * W1,
         AW2 = A * W2,
         AW3 = A * W3,
         AW4 = A * W4,
         AW5 = A * W5,
         AW6 = A * W6,
         AW7 = A * W7,
         AW8 = A * W8,
         AW9 = A * W9,
         AW10 = A * W10,
         AW11 = A * W11,
         AW12 = A * W12,
         AW13 = A * W13,
         AW14 = A * W14,
         AW15 = A * W15,
         AW16 = A * W16,
         AW17 = A * W17,
         AW18 = A * W18,
         AW19 = A * W19,
         AW20 = A * W20,
         AW21 = A * W21,
         MW1 = M * W1,
         MW2 = M * W2,
         MW3 = M * W3,
         MW4 = M * W4,
         MW5 = M * W5,
         MW6 = M * W6,
         MW7 = M * W7,
         MW8 = M * W8,
         MW9 = M * W9,
         MW10 = M * W10,
         MW11 = M * W11,
         MW12 = M * W12,
         MW13 = M * W13,
         MW14 = M * W14,
         MW15 = M * W15,
         MW16 = M * W16,
         MW17 = M * W17,
         MW18 = M * W18,
         MW19 = M * W19,
         MW20 = M * W20,
         MIS = is.na(Yyear) |> as.numeric()
  )


# Imputation --------------------------------------------------------------
# Temporarily convert years of education to ordinal scale (1,2,3) for imputation
# This will be converted back to original scale after imputation
d_imp <- d_imp |> 
  mutate(Yyear = case_match(Yyear,    # 
                            12 ~ 1,
                            14 ~ 2,
                            16 ~ 3,
                            .default = NA)) |> 
  select(-education)

tic()
set.seed(123456)
imp_amelia <- Amelia::amelia(data.frame(d_imp), 
                     m = M_num,
                     idvars = c("ID", "MIS"),
                     parallel = "multicore",
                     noms = c("Yuniv","X1","X9","Z2","Z3","W1","W2","W3","W4","W5","Z6","Z7","Z12"),
                     ords = c("Yyear","X3","X4","X7","X8","Z4","Z5"),
                     bounds = rbind(c(4,25,75),  # Yhensa
                                    c(25,0,2.4), # Z8: Study time
                                    c(26,0,2.4), # Z9: Study time
                                    c(27,0,2.4), # Z10: Study time
                                    c(28,0,2.4)), # Z11: Study time
                     ncpus = parallel::detectCores()
)
toc()

imp_amelia_raw <- imp_amelia
imp_amelia <- imp_amelia |> transform(
                        A = rank(A) / nrow(d_imp),
                        M = rank(M) / nrow(d_imp),
                        Yyear = case_match(Yyear,
                                           1 ~ 12,
                                           2 ~ 14,
                                           3 ~ 16,
                                           .default = NA)
                        )
imp_amelia <- imp_amelia |> transform(
  Yuniv = ifelse(Yyear >= 16,1,0)
  )
imp_amelia <- imp_amelia |> transform(
  Yrank = ifelse(Yyear < 16,Yyear,Yhensa)
  )
imp_amelia <- imp_amelia |> transform(
  Yrank = rank(Yrank) / nrow(d_imp)
  )

imps_raw <- imp_amelia$imputations
imp_data <- miceadds::datlist2mids(imp_amelia$imputations)

# Descriptive statistics
imps_raw_merge <- complete(imp_data, "long")

# Table 1 -----------------------------------------------------------------
Table_1 <- list()
Table_1[[1]] <- imps_raw_merge %>% 
  select(-c(.imp,.id,ID)) %>%
  summarise(across(everything(), list(mean = ~mean(.),
                                      sd = ~sd(.),
                                      min = ~min(.),
                                      max = ~max(.),
                                      n = ~sum(!is.na(.))/M_num))) %>%
  pivot_longer(-0, names_to = "V", values_to = "val") %>%
  separate(col = V, into = c("Var","stat"), sep = "_") %>%
  pivot_wider(id_cols = Var, names_from = "stat", values_from = "val")

Table_1[[2]] <- Table_1[[1]][1:28,] %>%
  mutate(Node = c("Y (year)","Y (univ)","Y (rank)","A","M",paste0("X",1:11),paste0("Z",1:12)),
         Variable = c("Years of education",
                      "University enrolment",
                      "Rank of university",
                      "Family income",
                      "High school selectivity",
                      "Women",
                      "Parental SEI / 10",
                      "Father's education",
                      "Mother's education",
                      "Neighborhood",
                      "Distance to the nearest Univ",
                      "Number of siblings",
                      "Birth order",
                      "Grandparents' education",
                      "Mother's age / 10",
                      "Birth month (April = 0)",
                      "GPA at 9th grade",
                      "Child's University expectation at 9th grade",
                      "Mother's University expectation for the child at 9th grade",
                      "Child's subjective wealth",
                      "Mother's subjective wealth",
                      "Cram school",
                      "Private tutor",
                      "Study time at non-school facilities (weekdays)",
                      "Study time at home (weekdays)",
                      "Study time at non-school facilities (weekends)",
                      "Study time at home (weekends)",
                      "Private or national JH")) %>% 
  select(Node, Variable, Mean = mean, SD = sd, Min = min, Max = max, N = n)
Table_1[[2]] |> print(n = Inf)


# Figure 2 ----------------------------------------------------------------
# Distributions of (A) the Absolute Educational Attainment, (B) Selectivity of University (Hensachi), and (C) the Relative Educational Attainment.
Hensa <- 
  imps_raw_merge %>%
  filter(.imp == 1 & Yuniv == 1) %>% 
  ggplot(aes(x = Yhensa)) +
  geom_histogram(aes(y = ..count../sum(..count..)), alpha = 0.8, fill = "green") + 
  labs(x = "(B) Standardized Rank Score of University (Hensachi)", y = "Proportion") + 
  annotate(geom = "text", x = 60, y = 0.075, label = "Four-year university") + 
  #scale_fill_grey() + 
  theme_minimal()

Relative0 <- 
  imps_raw_merge %>% 
  filter(.imp == 1) %>% 
  mutate(col = ifelse(Yyear <= 12 & !is.na(Yyear), 1, 
                      ifelse(Yyear <= 14 & !is.na(Yyear), 2, 
                             ifelse(Yyear <= 16 & !is.na(Yyear), 3, NA))) %>% 
           factor()) %>% 
  mutate(Yyear = rank(Yyear) / nrow(.)) %>%
  select(Yyear,col) %>%
  ggplot(aes(x = Yyear, fill = col, group = col)) +
  geom_histogram(aes(y = ..count../sum(..count..)), alpha = 0.8) + 
  labs(x = "(A) Rank of Education", y = "Proportion") + 
  coord_cartesian(xlim = 0:1, ylim = c(0,.6)) + 
  annotate(geom = "text", x = 0.09, y = 0.2, label = "High School") + 
  annotate(geom = "text", x = 0.29, y = 0.3, label = "Professional Training School\n Junior College\n Technical College") + 
  annotate(geom = "text", x = 0.8, y = 0.4, label = "Four-year University") + 
  scale_fill_manual(values = c("pink", "blue","green")) + 
  #scale_fill_grey() + 
  theme_minimal() + 
  theme(legend.position = "none")

Relative <- 
  imps_raw_merge %>% 
  filter(.imp == 1) %>% 
  mutate(col = ifelse(Yyear <= 12, 1, 
                      ifelse(Yyear <= 14, 2, 3)) %>% factor()) %>% 
  select(Yrank,col) %>%
  ggplot(aes(x = Yrank, fill = col, group = col)) +
  geom_histogram(aes(y = ..count../sum(..count..)), alpha = 0.8) + 
  labs(x = "(C) New Rank of Education", y = "Proportion") + 
  coord_cartesian(xlim = 0:1, ylim = c(0,0.6)) + 
  annotate(geom = "text", x = 0.09, y = 0.2, label = "High school") + 
  annotate(geom = "text", x = 0.29, y = 0.3, label = "Professional Training School\n Junior College\n Technical College") + 
  annotate(geom = "text", x = 0.70, y = 0.075, label = "Four-year University") + 
  scale_fill_manual(values = c("pink", "blue","green")) + 
  #scale_fill_grey() + 
  theme_minimal() + 
  theme(legend.position = "none")

fig_2 <- ggpubr::ggarrange(Relative0, Hensa, Relative, ncol = 3)
fig_2
ggsave("fig_2.png", 
       fig_2,
       dpi = 600,
       device = "png", height = 4, width = 12)


# Centering ---------------------------------------------------------------
imp_amelia <- transform(imp_amelia, 
                        A = A - mean(A),
                        M = M - mean(M),
                        X1 = X1 - mean(X1),
                        X2 = X2 - mean(X2),
                        X3 = X3 - mean(X3),
                        X4 = X4 - mean(X4),
                        X5 = X5 - mean(X5),
                        X6 = X6 - mean(X6),
                        X7 = X7 - mean(X7),
                        X8 = X8 - mean(X8),
                        X9 = X9 - mean(X9),
                        X10 = X10 - mean(X10),
                        X11 = X11 - mean(X11),
                        Z1 = Z1 - mean(Z1),
                        Z2 = Z2 - mean(Z2),
                        Z3 = Z3 - mean(Z3),
                        Z4 = Z4 - mean(Z4),
                        Z5 = Z5 - mean(Z5),
                        Z6 = Z6 - mean(Z6),
                        Z7 = Z7 - mean(Z7),
                        Z8 = Z8 - mean(Z8),
                        Z9 = Z9 - mean(Z9),
                        Z10 = Z10 - mean(Z10),
                        Z11 = Z11 - mean(Z11),
                        Z12 = Z12 - mean(Z12)
                        )
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
write_csv(Table_1[[1]][-1,], "table1_1.csv")
write_csv(Table_1[[2]][-1,], "table1_2.csv")


# Conventional regression models ------------------------------------------

for (i in c("Yuniv","Yrank")) {
  
  Y <- i
  
  ifelse(Y == "Yuniv",
         imp_amelia <- transform(imp_amelia, Y = Yuniv),
         imp_amelia <- transform(imp_amelia, Y = Yrank)
  )
  imp_amelia_raw <- imp_amelia
  imps_raw <- imp_amelia_raw$imputations
  imp_data <- datlist2mids(imp_amelia$imputations)
  
  # centering
  imp_amelia <- transform(imp_amelia, Y = Y - mean(Y))

  # ANALYSIS 1
  # M model
  fit <- list()
  fit[[1]] <- imp_data %>%
    with(lm_robust(M ~ A)) %>% pool() %>% tidy(conf.int = TRUE)
  fit[[2]] <- imp_data %>%
    with(lm_robust(M ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11)) %>% pool() %>% tidy(conf.int = TRUE)
  fit |> map(~kable(.,digits = 3, format = "simple"))
  
  fig3_coef_1 <- bind_rows(fit, .id = "Model") %>% 
    filter(term == "A") %>% 
    mutate(Model = factor(Model, 
                          levels = 1:2,
                          labels = c("(1) Treatment Only ",
                                     "(2) Treatment + \n Pretreatment Covariates\n\n"))) 
  
  FIG_med_out <- fig3_coef_1 %>% 
    ggplot(aes(x = Model, y = estimate, ymin = conf.low, ymax = conf.high)) +
    geom_hline(yintercept = 0, color = "black") + 
    geom_text(aes(label = paste0(format(round(estimate,3), nsmall = 3),"\n(",
                                 format(round(std.error,3), nmall = 3),")")), nudge_x = 0.15) + 
    geom_pointrange() + 
    labs(y = "Coefficient of Family Income") + 
    theme_minimal() +
    ggtitle(label = "(A) Outcome = Standardized Rank Scores of High School")
  FIG_med_out

  # ANALYSIS 2
  # Y model
  fit <- list()
  fit[[1]] <- imp_data %>%
    with(lm_robust(Y ~ M)) %>% pool() %>% tidy(conf.int = TRUE)
  fit[[2]] <- imp_data %>%
    with(lm_robust(Y ~ M + A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11)) %>% pool() %>% tidy(conf.int = TRUE)
  fit[[3]] <- imp_data %>%
    with(lm_robust(Y ~ M + A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + Z1 + Z2 + Z3 + Z4 + Z5 + Z6 + Z7 + Z8 + Z9 + Z10 + Z11 + Z12)) %>% pool() %>% tidy(conf.int = TRUE)
  fit |> map(~kable(.,digits = 3, format = "simple"))
  
  
  fig3_coef_2 <- bind_rows(fit, .id = "Model") %>% 
    filter(term == "M") %>% 
    mutate(Model = factor(Model, 
                          levels = 1:3,
                          labels = c("(1) Mediator Only ",
                                     "(2) Mediator + \n Treatment + \n Pretreatment Covariates",
                                     "(3) Mediator + \n Treatment + \n Pretreatment Covariates + \n Intermediate Covariates")))
  FIG_med_coef <- 
    fig3_coef_2 %>% 
    ggplot(aes(x = Model, y = estimate, ymin = conf.low, ymax = conf.high)) +
    geom_hline(yintercept = 0, color = "black") + 
    geom_text(aes(label = paste0(format(round(estimate,3), nsmall = 3),"\n(",
                                 format(round(std.error,3), nmall = 3),")")),
              nudge_x = 0.2) + 
    geom_pointrange() + 
    labs(y = "Coefficient of the Standardized Rank Scores of High School") + 
    theme_minimal() + ggtitle(label = "(B) Outcome = University Enrollment")
  FIG_med_coef
  
  # Figures E1 and E3 ----------------------------------------------------------------
    ifelse(Y == "Yuniv",
         ggpubr::ggarrange(FIG_med_out,FIG_med_coef) %>% 
           ggsave("fig_E1.png", 
                  plot = .,
                  width = 10.5,
                  height = 5),
         ggpubr::ggarrange(FIG_med_coef + ggtitle(label = "Outcome = Rank of Education")) %>% 
           ggsave("fig_E3.png", 
                  plot = .,
                  width = 10.5 * 0.65,
                  height = 5)
  )
  
  # ANALYSIS 3
  fit <- list()
  fit[[1]] <- imp_data %>%
    with(lm_robust(Y ~ A)) %>% pool() %>% tidy(conf.int = TRUE)
  fit[[2]] <- imp_data %>%
    with(lm_robust(Y ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11)) %>% pool() %>% tidy(conf.int = TRUE)
  fit[[3]] <- imp_data %>%
    with(lm_robust(Y ~ A + M + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11)) %>% pool() %>% tidy(conf.int = TRUE)
  fit[[4]] <- imp_data %>%
    with(lm_robust(Y ~ A + M + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 +
                     Z1 + Z2 + Z3 + Z4 + Z5 + Z6 + Z7 + Z8 + Z9 + Z10 + Z11 + Z12)) %>% pool() %>% tidy(conf.int = TRUE)
  fit[[5]] <- imp_data %>%
    with(lm_robust(Y ~ A * M + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11)) %>% pool() %>% tidy(conf.int = TRUE)
  fit[[6]] <- imp_data %>%
    with(lm_robust(Y ~ A * M + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + 
                     Z1 + Z2 + Z3 + Z4 + Z5 + Z6 + Z7 + Z8 + Z9 + Z10 + Z11 + Z12)) %>% pool() %>% tidy(conf.int = TRUE)
  fit |> map(~kable(.,digits = 3, format = "simple"))
  
  

# Table 3 -----------------------------------------------------------------
  ifelse(Y == "Yuniv", 
         write_csv(bind_rows(fit, .id = "Model"), "coef_reg_univ.csv"),
         write_csv(bind_rows(fit, .id = "Model"), "coef_reg_rank.csv")
  )
  
  FIG_coef <- 
    bind_rows(fit, .id = "Model") %>% 
    filter(term == "A") %>% 
    filter(Model <= 4) %>%
    mutate(Model = factor(Model, 
                          levels = 1:4,
                          labels = c("(1) Treatment Only ",
                                     "(2) Treatment + \n Pretreatment Covariates",
                                     "(3) Treatment + \n Pretreatment Covariates + \n Mediator",
                                     "(4) Treatment + \n Pretreatment Covariates + \n Mediator + \n Intermediate Covariates"))) %>% 
    ggplot(aes(x = Model, y = estimate, ymin = conf.low, ymax = conf.high)) +
    geom_hline(yintercept = 0, color = "black") + 
    geom_text(aes(label = paste0(format(round(estimate,3), nsmall = 3),"\n(",
                                 format(round(std.error,3), nmall = 3),")")),
              nudge_x = 0.2) + 
    geom_pointrange() + 
    labs(y = "Coefficient of Family Income") + 
    theme_minimal()
  FIG_coef

# Figures E2 and E4 ---------------------------------------------------------
  ifelse(Y == "Yuniv",
         FIG_coef %>% 
           ggsave("fig_E2.png", 
                  plot = .,
                  width = 7,
                  height = 4),
         FIG_coef %>% 
           ggsave("fig_E4.png", 
                  plot = .,
                  width = 7,
                  height = 4)
  )
  

# Regression with residuals -----------------------------------------------

  
  # ANALYSIS 4: RWR
  treatment <- "A"
  pre_cov <- c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11")
  # RWR with treatment induced-confounders
  y_form <- Y ~ A * (X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + M) + 
    M * (X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + 
           Z1 + Z2 + Z3 + Z4 + Z5 + Z6 + Z7 + Z8 + Z9 + Z10 + Z11 + Z12)
  m_form <- M ~ A * (X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11)
  #m_form <- M ~ A * (X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + Z1 + Z2 + Z3 + Z4 + Z5)
  
  RWR <- map(imps_raw, ~filter(.) %>% 
               rwrmed(treatment = treatment, 
                      pre_cov = pre_cov, 
                      zmodels = list(lm(Z1 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z2 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z3 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z4 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z5 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z5 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z6 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z7 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z8 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z9 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z10 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z11 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .),
                                     lm(Z12 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + A,
                                        data = .)
                      ),
                      y_form = y_form,
                      m_form = m_form,
                      data = .))
  
  Appendix_table <- list()
  Appendix_table[[1]] <- 
    map(1:M_num,~RWR[[.]]$y_model) %>% 
    pool() %>% 
    tidy(conf.int = TRUE) 
  
  Appendix_table[[2]] <- 
    map(1:M_num,~RWR[[.]]$m_model) %>% 
    pool() %>% 
    tidy(conf.int = TRUE) 
  Appendix_table
  
  Appendix_table <- bind_rows(Appendix_table, .id = "model") %>% 
    mutate(model = factor(model, levels = 1:2, labels = c("y_model", "m_model")))
  
  coef_CDE <- map(1:M_num,~RWR[[.]]$y_model) %>% 
    pool() %>% 
    tidy(conf.int = TRUE) %>%
    filter(term == "A" | term == "A:M")
  
  coef_CDE %>%
    kable(digits = 3)
  
  decom <- list()
  tic()
  for (i in 1:21) {
    decom[[i]] <- map(RWR, ~decomp(., 
                                   a0 = -0.5, 
                                   a1 = 0.5, 
                                   m = (-0.50 + 0.05 * (i - 1)), 
                                   rep = REP) %$% 
                        fourcomp %>% 
                        as_tibble(rownames = "Effect")) %>%
      bind_rows(.id = "imp")
  }
  toc()

  decom2 <- list()
  tic()
  for (i in 11) {
    decom2[[i]] <- map(RWR, ~decomp(., 
                                    a0 = -0.5, 
                                    a1 = 0.5, 
                                    m = (-0.50 + 0.05 * (i - 1)), 
                                    rep = REP) %$% 
                         twocomp %>% 
                         as_tibble(rownames = "Effect")) %>%
      bind_rows(.id = "imp")
  }
  toc()

  decom_four <- decom %>% 
    bind_rows(.id = "M") %>%
    group_by(M, Effect) %>%
    summarise(
      theta = mean(Estimate),
      V_W = mean(SE ^ 2),
      V_B = var(Estimate),
      V_T = V_W + V_B + V_B / M_num,
      SE = sqrt(V_T)
    ) %>%
    mutate(M = (0.05 * (as.numeric(M) - 1))) %>%
    arrange(M) %>%
    print(n = Inf)
  
  
  decom_three <- decom2[[11]] %>% 
    bind_rows(.id = "M") %>%
    group_by(M, Effect) %>%
    summarise(
      theta = mean(Estimate),
      V_W = mean(SE ^ 2),
      V_B = var(Estimate),
      V_T = V_W + V_B + V_B / M_num,
      SE = sqrt(V_T)
    ) %>%
    mutate(M = (0.05 * (as.numeric(M) - 1))) %>%
    arrange(M) %>%
    print(n = Inf)
  
  
  FIG_RWR_Decom <- decom %>% 
    bind_rows(.id = "M") %>%
    group_by(M, Effect) %>% 
    summarise_if(is.numeric, mean) %>% 
    mutate(M = (0.05 * (as.numeric(M) - 1)),
           Effect = factor(Effect,
                           levels = c("rATE","CDE","rPIE","rINTREF","rINTMED"))) %>%
    ggplot(aes(x = M, y = Estimate, ymin = `2.5% Perc`, ymax = `97.5% Perc`)) +
    geom_line() + 
    geom_hline(yintercept = 0, color = "black") + 
    geom_ribbon(fill = "gray", alpha = 0.3) + 
    geom_point(aes(x = ifelse(M %in% c(0,0.25,0.5,0.75,1), M, NA))) + 
    geom_text(aes(label = round(Estimate,3), 
                  x = ifelse(M %in% c(0,0.25,0.5,0.75,1), M, NA)),
              nudge_y = 0.05) + 
    labs(x = "Standardized Rank Score of High School",
         y = "Effect") + 
    facet_wrap(~Effect) +
    theme_minimal() 
  FIG_RWR_Decom
  
  
  ifelse(Y == "Yuniv",
         FIG_RWR_Decom %>%
           ggsave("Appendix_fig_1.png", 
                  plot = .,
                  width = 10,
                  height = 5),
         FIG_RWR_Decom %>%
           ggsave("Appendix_fig_2.png", 
                  plot = .,
                  width = 10,
                  height = 5)
  )

    FIG_RWR <- decom %>% 
    bind_rows(.id = "M") %>%
    group_by(M,Effect) %>%
    summarise_if(is.numeric, mean) %>%
    filter(Effect == "CDE") %>%
    mutate(M = (0.05 * (as.numeric(M) - 1))) %>%
    ggplot(aes(x = M, y = Estimate, ymin = `2.5% Perc`, ymax = `97.5% Perc`)) +
    geom_line() + 
    geom_hline(yintercept = 0, color = "black") + 
    geom_ribbon(fill = "gray", alpha = 0.3) + 
    geom_point(aes(x = ifelse(M %in% c(0,0.25,0.5,0.75,1), M, NA))) + 
    geom_text(aes(label = paste0(round(Estimate,3),"\n(",round(SE,3),")"), 　
                  x = ifelse(M %in% c(0,0.25,0.5,0.75,1), M, NA)),
              nudge_y = 0.075) +   
    labs(x = "Standardized Rank Score of High School",
         y = "CDE (Controlled Direct Effects)") + 
    theme_minimal() 
  FIG_RWR

# Figures 3 and 4 ---------------------------------------------------------
  ifelse(Y == "Yuniv", 
         FIG_RWR %>%
           ggsave("fig_3.png", 
                  plot = .,
                  width = 6,
                  height = 4),
         FIG_RWR %>%
           ggsave("fig_4.png", 
                  plot = .,
                  width = 6,
                  height = 4)
  )
  
  ifelse(Y == "Yuniv", 
         write_csv(Appendix_table, "Appendix_table_univ.csv"),
         write_csv(Appendix_table, "Appendix_table_rank.csv")
  )
  
  ifelse(Y == "Yuniv", 
         write_csv(decom_four, "decom_four_univ.csv"),
         write_csv(decom_four, "decom_four_rank.csv")
  )

# Table 2 -----------------------------------------------------------------
  ifelse(Y == "Yuniv", 
         write_csv(decom_three, "decom_three_univ.csv"),
         write_csv(decom_three, "decom_three_rank.csv")
  )
  
  ifelse(Y == "Yuniv", 
         write_csv(coef_CDE, "coef_CDE.csv"),
         write_csv(coef_CDE, "coef_CDE_rank.csv")
  )
  
}



# G-formula ---------------------------------------------------------------
estimand <- tab_univ <- Evalues <- list()

set.seed(123456)
for (i in c("Yuniv", "Yrank")) {
  for (m in c(1:5)) {
    
    Y <- i
    
    ifelse(Y == "Yuniv",
           imp_amelia <- transform(imp_amelia, Y = Yuniv),
           imp_amelia <- transform(imp_amelia, Y = Yrank)
    )
    imp_amelia_raw <- imp_amelia
    imps_raw <- imp_amelia_raw$imputations
    imp_data <- datlist2mids(imp_amelia$imputations)
    
    imps_raw_merge <- 
      map(1:M_num, ~bind_rows(imps_raw[[.]])) %>% 
      bind_rows(.id = "imp")
    
    # g-formula
    res_gformula <- list()
    res_gformula <- map(1:M_num,~imps_raw[[.]] %>% 
                          cmest(data = ., model = "gformula", 
                                outcome = "Y",
                                exposure = "A", mediator = "M", 
                                basec = c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11"),
                                postc = c("Z1","Z2","Z3","Z4","Z5","Z6","Z7","Z8","Z9","Z10","Z11","Z12"), EMint = TRUE,
                                yreg = "linear",
                                mreg = list(glm(M ~ A * (X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + 
                                                           Z1 + Z2 + Z3 + Z4 + Z5 + Z6 + Z7 + Z8 + Z9 + Z10 + Z11 + Z12), 
                                                family = gaussian, data = .)), 
                                postcreg = list(glm(Z1 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .),
                                                glm(Z2 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .),
                                                glm(Z3 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .),
                                                glm(Z4 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .),
                                                glm(Z5 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .),
                                                glm(Z6 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .),
                                                glm(Z7 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .),
                                                glm(Z8 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .),
                                                glm(Z9 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .),
                                                glm(Z10 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .),
                                                glm(Z11 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .),
                                                glm(Z12 ~ A + X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11, family = gaussian, data = .)),
                                astar = -0.5, a = 0.5, mval = list((-0.50 + 0.25 * (m - 1))), 
                                estimation = "imputation", inference = "bootstrap", nboot = REP))
    
    estimand[[i]][[m]] <- map_dfr(1:M_num, ~res_gformula[[.]] %>% 
                                    summary %$% 
                                    summarydf %>% 
                                    rownames_to_column() %>%
                                    tibble(), .id = "imp") %>% 
      group_by(rowname) %>% 
      summarise(
        theta = mean(Estimate),
        V_W = mean(Std.error ^ 2),
        V_B = var(Estimate),
        V_T = V_W + V_B + V_B / M_num,
        SE = sqrt(V_T)
      )
    tab_univ[[i]][[m]] <- map_dfr(1:M_num, ~cmsens(object = res_gformula[[.]], sens = "uc") %$% 
                                    evalues %>% 
                                    data.frame() %>% 
                                    rownames_to_column() %>% 
                                    tibble(), .id = "imp")
    
  }
  
}


for (i in c("Yuniv", "Yrank")) {
  for (m in c(1:5)) {
    Evalues[[i]][[m]] <- tab_univ[[i]][[m]] %>% 
      group_by(rowname) %>% 
      filter(rowname %in% c("cde", "te", "rpnde", "rtnie")) %>%
      summarise(Evalue.estRR = median(Evalue.estRR),
                Evalue.lowerRR = median(Evalue.lowerRR)) %>% 
      mutate(order = c(1,2,3,4)) %>% 
      arrange(order) %>% 
      select(-order)
  }
}



estimand[[1]] |> 
  bind_rows(.id = "M") |> 
  filter(rowname %in% c("te", "rpnde", "rtnie")) |> 
  filter(M == 3) |> 
  mutate(order = c(3,2,1)) |> 
  arrange(order) %>% 
  select(-order) |> write_csv( 
    paste0("Estimand_gformula_",1,".csv"))


estimand[[2]] |> 
  bind_rows(.id = "M") |> 
  filter(rowname %in% c("te", "rpnde", "rtnie"))|> 
  filter(M == 3) |> 
  mutate(order = c(3,2,1)) |> 
  arrange(order) %>% 
  select(-order)|> write_csv( 
    paste0("Estimand_gformula_",2,".csv"))


# Figure D1 ---------------------------------------------------------------

fig_gformula_1 <- estimand[[1]] |> 
  bind_rows(.id = "M") |> 
  mutate(M = (0.25 * (as.numeric(M) - 1))) |> 
  filter(rowname == "cde") |> 
  ggplot(aes(x = M, y = theta, ymin = theta - qnorm(0.025) * SE, ymax = theta + qnorm(0.025) * SE)) +
  geom_line() + 
  geom_hline(yintercept = 0, color = "black") + 
  geom_ribbon(fill = "gray", alpha = 0.3) + 
  geom_point(aes(x = ifelse(M %in% c(0,0.25,0.5,0.75,1), M, NA))) + 
  geom_text(aes(
    label = paste0(format(round(theta, 3), nsmall = 3), "\n(", format(round(SE, 3), nsmall = 3), ")"), 　
    x = ifelse(M %in% c(0, 0.25, 0.5, 0.75, 1), M, NA)
  ),
  nudge_y = 0.075) +
  labs(x = "Standardized Rank Score of High School",
       y = "CDE (Controlled Direct Effects)") +
  theme_minimal() 

fig_gformula_1 %>%
  ggsave("fig_D1.png", 
         plot = .,
         width = 6,
         height = 4)

# Figure D2 ---------------------------------------------------------------

fig_gformula_2 <- estimand[[2]] |> 
  bind_rows(.id = "M") |> 
  mutate(M = (0.25 * (as.numeric(M) - 1))) |> 
  filter(rowname == "cde") |> 
  ggplot(aes(x = M, y = theta, ymin = theta - qnorm(0.025) * SE, ymax = theta + qnorm(0.025) * SE)) +
  geom_line() + 
  geom_hline(yintercept = 0, color = "black") + 
  geom_ribbon(fill = "gray", alpha = 0.3) + 
  geom_point(aes(x = ifelse(M %in% c(0,0.25,0.5,0.75,1), M, NA))) + 
  geom_text(aes(
    label = paste0(format(round(theta, 3), nsmall = 3), "\n(", format(round(SE, 3), nsmall = 3), ")"), 　
    x = ifelse(M %in% c(0, 0.25, 0.5, 0.75, 1), M, NA)
  ),
  nudge_y = 0.075) +
  labs(x = "Standardized Rank Score of High School",
       y = "CDE (Controlled Direct Effects)") +
  theme_minimal() 

fig_gformula_2 %>%
  ggsave("fig_D2.png", 
         plot = .,
         width = 6,
         height = 4)

# Figures 5 and 6 ---------------------------------------------------------------

fig_evalue_1 <- 
  Evalues$Yuniv |> 
  bind_rows(.id = "M") |> 
  filter(rowname == "cde") |> 
  pivot_longer(cols = Evalue.estRR:Evalue.lowerRR) |>
  mutate(`E-value` = case_match(name,
                                "Evalue.estRR" ~ "E-value",
                                "Evalue.lowerRR" ~ "E-value (LL)"),
         M = (0.25 * (as.numeric(M) - 1))) |> 
  ggplot(aes(x = M, y = value, linetype = `E-value`)) + 
  geom_line() +
  geom_point(aes(x = ifelse(M %in% c(0,0.25,0.5,0.75,1), M, NA))) + 
  geom_text(aes(
    label = paste0(format(round(value, 3), nsmall = 3)), 　
    x = ifelse(M %in% c(0, 0.25, 0.5, 0.75, 1), M, NA)
  ),
  nudge_y = 0.12) +
  labs(x = "Standardized Rank Score of High School",
       y = "Evalue") +
  theme_minimal() +
  theme(legend.position = "bottom")
fig_evalue_1 |> 
  ggsave("fig_5.png",  # D3
         plot = _,
         width = 6,
         height = 4)

fig_evalue_2 <- 
  Evalues$Yrank |> 
  bind_rows(.id = "M") |> 
  filter(rowname == "cde") |> 
  pivot_longer(cols = Evalue.estRR:Evalue.lowerRR) |>
  mutate(`E-value` = case_match(name,
                                "Evalue.estRR" ~ "E-value",
                                "Evalue.lowerRR" ~ "E-value (LL)"),
         M = (0.25 * (as.numeric(M) - 1))) |> 
  ggplot(aes(x = M, y = value, linetype = `E-value`)) + 
  geom_line() +
  geom_point(aes(x = ifelse(M %in% c(0,0.25,0.5,0.75,1), M, NA))) + 
  geom_text(aes(
    label = paste0(format(round(value, 3), nsmall = 3)), 　
    x = ifelse(M %in% c(0, 0.25, 0.5, 0.75, 1), M, NA)
  ),
  nudge_y = 0.12) +
  labs(x = "Standardized Rank Score of High School",
       y = "E-value") +
  theme_minimal() +
  theme(legend.position = "bottom")
fig_evalue_2 |> 
  ggsave(plot = _, 
         filename = "fig_6.png",  # D4
         width = 6,
         height = 4)


# Table 4 -----------------------------------------------------------------

Evalues_1 <- tab_univ$Yuniv %>% 
  bind_rows() |> 
  group_by(rowname) %>% 
  filter(rowname %in% c("te", "rpnde", "rtnie")) %>%
  summarise(Evalue.estRR = median(Evalue.estRR),
            Evalue.lowerRR = median(Evalue.lowerRR)) %>% 
  mutate(order = c(2,3,1)) %>% 
  arrange(order) %>% 
  select(-order)

Evalues_2 <- tab_univ$Yrank %>% 
  bind_rows() |> 
  group_by(rowname) %>% 
  filter(rowname %in% c("te", "rpnde", "rtnie")) %>%
  summarise(Evalue.estRR = median(Evalue.estRR),
            Evalue.lowerRR = median(Evalue.lowerRR)) %>% 
  mutate(order = c(2,3,1)) %>% 
  arrange(order) %>% 
  select(-order)

write_csv(Evalues_1, 
          paste0("Evalues_",1,".csv"))

write_csv(Evalues_2, 
          paste0("Evalues_",2,".csv"))